NeXT Education Software Sampler 1992 Fall

home *** CD-ROM | disk | FTP | other *** search

/ NeXT Education Software Sampler 1992 Fall / NeXT Education Software Sampler 1992 Fall.iso / Programming / Source / WAIS / ir / irhash.c < prev next >

Wrap

C/C++ Source or Header | 1992-02-02 | 22.6 KB | 714 lines

/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* The memory hashtables for building an index. */ /* -brewster 5/90 */ /* main functions: * add_word * finished_add_word * look_up_word * * The idea is to store up a bunch of words before going to disk. * A word entry points to where it will go on disk, and * accumulates the entries before doing it. * * Some of the policy issues in this file are: * How much weight should the first occurance of a word in a document get * over the other occurances. The first occurance should be worth more * so that words with 3 occurances of "dog" and not "cat"'s should not * win out over 1 "dog" and 1 "cat" if the question is "Tell me about cats * torture dogs" * The extra weight is 5 at this point. * */ /* To Do: * Improve the hashing functions. * done: stop inserting into hash table after max number have been accumulated * done: make flush not flush buffers that are too big. */ #include <ctype.h> #include <string.h> /* for strlen(), memset() */ #include "panic.h" #include "cutil.h" #include "irfiles.h" #include "irhash.h" #include "stoplist.h" #include "irinv.h" #ifdef UNIX #define PRINT_AS_INDEXING true /* also defined in irtfiles.c and irfiles.c */ #else #define PRINT_AS_INDEXING false #endif /*===========================* *=== Hashing Functions ===* *===========================*/ /* #define FAST_HASH */ #ifdef FAST_HASH /* courtesy ses@ccgr.technion.ac.il, but it turns out in informal timings that it increases the index time. sigh. */ static char coeff[] = { 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1, 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1, 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1, 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1, 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1, 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1, 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1, 61,59,53,47,43,41,37,31,29,23,17,13,11,7,3,1}; long hash_word(wd,below_n) char *wd; long below_n; { register char *foo; register long hash = 0; register int l; for(l=0,foo=wd;l<sizeof(coeff) && *foo ;l++) hash = hash + (*(foo++) * coeff[l]); return (hash % below_n); } #endif /* def FAST_HASH*/ #ifndef FAST_HASH /* these stink -brewster */ static long random_array_3[256] = {142L, 176L, 108L, 210L, 109L, 223L, 214L, 251L, 102L, 86L, 91L, 9L, 247L, 139L, 115L, 71L, 63L, 35L, 126L, 77L, 209L, 175L, 120L, 28L, 44L, 198L, 21L, 125L, 245L, 250L, 10L, 119L, 127L, 60L, 81L, 226L, 216L, 182L, 172L, 72L, 151L, 178L, 116L, 224L, 244L, 41L, 212L, 73L, 190L, 248L, 173L, 18L, 82L, 27L, 97L, 26L, 79L, 169L, 74L, 170L, 83L, 189L, 101L, 141L, 230L, 55L, 135L, 220L, 187L, 201L, 95L, 39L, 186L, 131L, 105L, 36L, 255L, 203L, 155L, 84L, 160L, 75L, 254L, 235L, 51L, 243L, 158L, 14L, 148L, 167L, 149L, 96L, 68L, 161L, 45L, 233L, 11L, 19L, 3L, 38L, 195L, 48L, 144L, 15L, 171L, 94L, 180L, 29L, 252L, 181L, 80L, 4L, 20L, 213L, 23L, 143L, 7L, 236L, 76L, 110L, 22L, 58L, 17L, 253L, 66L, 246L, 40L, 112L, 179L, 130L, 87L, 124L, 240L, 193L, 107L, 165L, 202L, 31L, 106L, 43L, 93L, 99L, 147L, 199L, 129L, 197L, 32L, 229L, 150L, 46L, 157L, 128L, 136L, 153L, 121L, 113L, 237L, 194L, 218L, 104L, 78L, 184L, 62L, 159L, 227L, 222L, 47L, 53L, 1L, 24L, 118L, 177L, 49L, 185L, 98L, 90L, 34L, 192L, 200L, 221L, 232L, 146L, 114L, 137L, 67L, 225L, 154L, 241L, 50L, 56L, 145L, 5L, 188L, 207L, 231L, 228L, 6L, 183L, 219L, 217L, 156L, 30L, 174L, 205L, 103L, 37L, 133L, 152L, 117L, 196L, 164L, 249L, 239L, 64L, 242L, 59L, 168L, 2L, 162L, 13L, 92L, 85L, 70L, 0L, 52L, 65L, 166L, 163L, 215L, 69L, 140L, 25L, 33L, 100L, 42L, 54L, 88L, 206L, 122L, 57L, 16L, 208L, 134L, 132L, 138L, 89L, 8L, 234L, 12L, 238L, 111L, 204L, 61L, 211L, 191L, 123L}; static long random_array_2[256] = {818L, 789L, 854L, 862L, 704L, 1019L, 390L, 887L, 93L, 204L, 269L, 59L, 743L, 219L, 191L, 769L, 911L, 435L, 805L, 448L, 142L, 1000L, 149L, 264L, 639L, 504L, 699L, 934L, 266L, 661L, 318L, 211L, 117L, 549L, 90L, 536L, 378L, 944L, 400L, 599L, 592L, 883L, 985L, 606L, 759L, 456L, 581L, 119L, 106L, 310L, 412L, 931L, 233L, 561L, 973L, 870L, 377L, 349L, 334L, 354L, 249L, 585L, 799L, 899L, 545L, 553L, 848L, 625L, 438L, 890L, 791L, 1014L, 337L, 374L, 489L, 146L, 123L, 907L, 977L, 22L, 396L, 241L, 198L, 424L, 136L, 715L, 867L, 684L, 560L, 244L, 293L, 1017L, 397L, 778L, 725L, 78L, 184L, 656L, 389L, 635L, 982L, 158L, 203L, 878L, 323L, 394L, 73L, 18L, 837L, 996L, 58L, 62L, 161L, 451L, 534L, 746L, 485L, 222L, 25L, 666L, 28L, 21L, 420L, 147L, 522L, 74L, 474L, 362L, 253L, 172L, 195L, 622L, 559L, 790L, 288L, 455L, 263L, 538L, 355L, 417L, 810L, 576L, 685L, 797L, 641L, 315L, 347L, 786L, 487L, 966L, 579L, 181L, 499L, 429L, 688L, 140L, 278L, 719L, 186L, 872L, 997L, 319L, 173L, 882L, 1008L, 573L, 431L, 830L, 774L, 654L, 235L, 121L, 925L, 529L, 593L, 92L, 954L, 434L, 213L, 79L, 284L, 510L, 763L, 655L, 300L, 447L, 4L, 461L, 506L, 88L, 99L, 459L, 220L, 780L, 523L, 178L, 303L, 578L, 287L, 827L, 419L, 521L, 114L, 703L, 664L, 892L, 304L, 876L, 352L, 331L, 35L, 896L, 341L, 450L, 812L, 350L, 316L, 705L, 815L, 935L, 15L, 572L, 503L, 467L, 306L, 976L, 118L, 760L, 807L, 809L, 339L, 442L, 758L, 546L, 327L, 527L, 537L, 383L, 82L, 531L, 728L, 428L, 768L, 675L, 814L, 919L, 133L, 682L, 906L, 163L, 716L, 692L, 174L, 464L, 708L, 922L}; /* static long random_char_code _AP((long ch,long offset)); static long random_char_code(ch,offset) long ch; long offset; { return(random_array_3[ (offset + (ch & 0xFF)) % 256]); } */ #define random_char_code(ch,offset)\ (random_array_3[ (offset + (ch ) ) & 0xff]) /* assumes the word has been downcased already */ static long hash_word(wd,below_n) char *wd; long below_n; { register long i=0; register long answer = 0; register char* foo; foo=wd; for (i = 0; *foo; foo++,i++) { answer = answer ^ (random_array_2[i % 256] + ((0 == (i & 1)) ? random_char_code((long)*foo, i) : (random_char_code((long)*foo, i)) << 8)); } return(answer % below_n); } #endif /* ndef FAST_HASH */ static long hash_word_2 _AP((char *wd)); static long hash_word_2(wd) char *wd; { long hash = hash_word(wd, ((1L << (8 * DICTIONARY_ENTRY_HASH_CODE_SIZE)) - 2)); return(1 + hash); } /* ================================ === Word Occurance Buffers === ================================ */ /* Word occurance buffers * This is a simple memory allocator for use with the word memory hashtable. * Since the buffers are tiny, this is done as a copy-sweep GC scheme. * Oh, I long for the storage system of lisp. */ char *first_word_occurance_buffer = NULL; /* allocate blocks out of this */ char *last_word_occurance_buffer = NULL; long word_occurance_block_length = 256000; /* maybe this should be larger? */ char * word_occurance_free_ptr = NULL; char *make_word_occurrance_block(size) long size; { /* allocates a word_occurance_block out of the buffers */ /* old way: s_malloc((size_t)size); */ /* returns a pointer to a piece of memory */ if(NULL == first_word_occurance_buffer){ /* initialize it */ first_word_occurance_buffer = (char *)s_malloc(MAX(word_occurance_block_length, sizeof(size_t)+ size)); *(char **)first_word_occurance_buffer = NULL; /* set the end */ last_word_occurance_buffer = first_word_occurance_buffer; word_occurance_free_ptr = first_word_occurance_buffer + sizeof(size_t); } if((long)word_occurance_free_ptr + size >= word_occurance_block_length + (long)last_word_occurance_buffer){ /* then allocate a new block */ char * new_block = (char *)s_malloc(MAX(word_occurance_block_length, sizeof(size_t)+ size)); *(char **)new_block = NULL; /* set the end of the chain */ *(char **)last_word_occurance_buffer = new_block; word_occurance_free_ptr = new_block + sizeof(size_t); last_word_occurance_buffer = new_block; } /* allocate away */ { char * answer = word_occurance_free_ptr; word_occurance_free_ptr += size; return(answer); } } void free_word_occurance_block(block) char *block; { /* this is not used with the new scheme, but is here in case malloc is a win on some systems */ /* old way s_free(block); */ } static void flush_word_occur_bufs_internal _AP((char* head_of_list)); static void flush_word_occur_bufs_internal(head_of_list) char* head_of_list; /* frees all word occurance buffers. This should be done with care */ { while(1){ char * next_block; if(NULL == head_of_list) break; next_block = *(char **)head_of_list; s_free(head_of_list); head_of_list = next_block; } } void flush_word_occurance_buffers() { /* frees all word occurance buffers. This should be done with care */ flush_word_occur_bufs_internal(first_word_occurance_buffer); first_word_occurance_buffer = NULL; word_occurance_free_ptr = NULL; last_word_occurance_buffer = NULL; } void gc_word_occurance_buffers(the_word_memory_hashtable) word_memory_hashtable * the_word_memory_hashtable; { /* go through the word_memory_hashtable and copy what we need into another list of buffers, the flush the old ones */ /* not needed yet */ } /* =============================== === Word Memory Hashtable === =============================== */ static long find_location _AP((char* word,word_memory_hashtable* the_word_memory_hashtable)); static long find_location(word,the_word_memory_hashtable) char* word; word_memory_hashtable* the_word_memory_hashtable; /* returns the location that the word should go (or is). returns -1 if * the hashtable is full and the word is not there */ { long hash_code = hash_word(word, the_word_memory_hashtable->size); long i; long hash_code_2 = hash_word_2(word); for(i = hash_code; i < (hash_code + the_word_memory_hashtable->size); i++){ long index = i % the_word_memory_hashtable->size; if(NULL == the_word_memory_hashtable->contents[index]){ /* found an open spot, return it */ return(index); } else if(hash_code_2 == the_word_memory_hashtable->contents[index]->hash_code && strcmp(word, the_word_memory_hashtable->contents[index]->word) == 0){ /* we win, return it */ return(index); } /* keep looking */ } return(-1); } /* this pushes all word entries to the top of the word_memory_hashtable * therefore messing up the hashing order, but allows for quick sorting * just before dumping to disk. */ void collapse_word_memory_hashtable(the_word_memory_hashtable) word_memory_hashtable *the_word_memory_hashtable; { long insert_index = 0; long extract_index; for(extract_index = 0; extract_index < the_word_memory_hashtable->size; extract_index++){ word_entry *entry = the_word_memory_hashtable->contents[extract_index]; if(NULL != entry) the_word_memory_hashtable->contents[insert_index++] = entry; } } static int word_entry_compare _AP((word_entry**i,word_entry** j)); static int word_entry_compare(i,j) word_entry **i; word_entry **j; { return(strcmp((*i)->word, (*j)->word)); } /* assumes that the word_memory_hashtable has been compressed */ void sort_word_memory_hashtable(the_word_memory_hashtable) word_memory_hashtable *the_word_memory_hashtable; { qsort(the_word_memory_hashtable->contents, the_word_memory_hashtable->number_of_entries, (size_t)sizeof(char *), word_entry_compare); } /* for debugging */ void print_word_memory_hashtable(the_word_memory_hashtable) word_memory_hashtable* the_word_memory_hashtable; { if (NULL == the_word_memory_hashtable){ cprintf(PRINT_AS_INDEXING, "No Hashtable allocated\n"); return; } cprintf(PRINT_AS_INDEXING, "Number of entries possible: %ld\n", the_word_memory_hashtable->size); cprintf(PRINT_AS_INDEXING, "Number of entries allocated: %ld\n", the_word_memory_hashtable->number_of_entries); if(NULL != the_word_memory_hashtable->contents){ long i; /* print the entries */ printf("The entries are:\n"); for(i = 0; i < the_word_memory_hashtable->size; i++){ if(NULL != the_word_memory_hashtable->contents[i]){ printf(" Position: %ld word: \"%s\" %ld occurances\n", i, the_word_memory_hashtable->contents[i]->word, the_word_memory_hashtable->contents[i]->number_of_occurances); } } } } static word_entry* look_up_word _AP((char* word,word_memory_hashtable* the_word_memory_hashtable)); static word_entry* look_up_word(word,the_word_memory_hashtable) char* word; word_memory_hashtable* the_word_memory_hashtable; { /* looks up the word in the dictionary and returns * a pointer to the word_entry. * If is not present, then it mallocs a new word entry. */ /* this is a pretty dumb hashing scheme XXX */ long index = find_location(word, the_word_memory_hashtable); if(-1 == index){ panic("the hashtable is completely full. It should have been grown\n"); } if(NULL == the_word_memory_hashtable->contents[index]){ /* make a new entry */ word_entry *new_entry = &the_word_memory_hashtable->word_entry_block [the_word_memory_hashtable->number_of_entries++]; if(NULL == new_entry){ panic("malloc failed for word_entry\n"); } strncpy(new_entry->word, word, MAX_WORD_LENGTH); new_entry->hash_code = hash_word_2(word); new_entry->number_of_occurances = 0; new_entry->memory_ptr = make_word_occurrance_block(WORD_MEMORY_INIT_BLOCK_SIZE); new_entry->current_memory_ptr = new_entry->memory_ptr; new_entry->memory_size = WORD_MEMORY_INIT_BLOCK_SIZE; new_entry->current_doc_id = 0; the_word_memory_hashtable->contents[index] = new_entry; return(new_entry); } else{ return(the_word_memory_hashtable->contents[index]); } } static unsigned char add_weight _AP((long current_weight,long new_weight)); static unsigned char add_weight(current_weight,new_weight) long current_weight; long new_weight; /* add a new weight to the existing one */ { /* this should be smarter than this, like doing the log or something */ if(127 < (current_weight + new_weight)){ /* the max char. should be 255, but does not work on all compilers */ return(127); } else{ return(current_weight + new_weight); } } static char* more_memory _AP((char* current_memory_ptr, long current_memory_size, long new_size)); static char* more_memory(current_memory_ptr,current_memory_size,new_size) char* current_memory_ptr; long current_memory_size; long new_size; /* Allocates more memory for a word_entry. It transfers all the bytes * from the old to the new and then returns the new. */ { char* new_memory = NULL; if(current_memory_size > new_size){ panic("trying to contract a word_entry block. This is not right\n"); } new_memory = make_word_occurrance_block(new_size); if(NULL == new_memory){ panic("Out of memory."); } memset(new_memory, 0, new_size); memmove(new_memory, current_memory_ptr, (size_t)current_memory_size); return(new_memory); } static long more_memory_size _AP((long current_size, long number_of_occurances)); static long more_memory_size(current_size,number_of_occurances) long current_size; long number_of_occurances; /* This is pretty important to get right. This is a place holder */ { return(MAX(2 * current_size, WORD_MEMORY_INIT_BLOCK_SIZE)); } static long write_bytes_to_memory _AP((long value,long size,char* ptr)); static long write_bytes_to_memory(value,size,ptr) long value; long size; char* ptr; { /* writes the number into memory lsb first. returns the number of bytes written */ long i; if(size < 0) /* paranoia */ panic("attempting to write a negative number of bytes"); ptr += size; /* start at the end of the block and write backwards */ for (i = 0; i < size; i++){ ptr--; *ptr = value & 0xFF; value = value >> 8; } return(size); } /* adds a word to the word_memory_hashtable. Currently it * ignores the character position XXX. * Returns the 0 if successful. See irext.h for more documentation. */ long add_word(word, char_pos, line_pos, weight, doc_id, date, db) char *word; /* the word to be indexed, this could be a word pair. If NULL there are no more words to be indexed */ long char_pos; /* the position of the start of the word */ long line_pos; /* this is passed for the best section calculation */ long weight; /* how important the word looks syntactically (such as is it bold) NOT used by signature system */ long doc_id; /* current document, this will never be 0 */ time_t date; /* display day of this document, 0 if not known */ database* db; /* database to insert the document */ { /* look up the word in the word_memory_hashtable */ /* creates it if necessary */ word_entry* wrd_entry; word_memory_hashtable * the_word_memory_hashtable = db->the_word_memory_hashtable; /* printf("Word: '%s' doc_id: %ld, pos: %ld, weight: %ld\n", word, doc_id, char_pos, weight); */ if(NULL == db->the_word_memory_hashtable){ panic("The memory word hashtable is not defined."); } /* if we have filled up the hashtable, or if we have indexed enough words flush the memory copies to disk */ if((the_word_memory_hashtable->number_of_entries == the_word_memory_hashtable->word_entry_block_size) || (the_word_memory_hashtable->number_of_words_indexed == the_word_memory_hashtable->flush_after_n_words)) flush_memory_hashtable_to_disk(db, false); the_word_memory_hashtable->number_of_words_indexed ++; wrd_entry = look_up_word(word, the_word_memory_hashtable); wrd_entry->number_of_occurances ++; if(wrd_entry->number_of_occurances > MAX_OCCURANCES){ /* do nothing. we have enough of that word */ } else{ /* we have a word to add */ if(doc_id != wrd_entry->current_doc_id){ /* then we have a new doc_id to add to the memory block */ wrd_entry->current_doc_id = doc_id; /* check to see if we need more memory */ if((wrd_entry->memory_size - (wrd_entry->current_memory_ptr - wrd_entry->memory_ptr) < DICTIONARY_ELEMENT_SIZE)){ /* we need more memory. this makes more and frees the old*/ char* old_memory_ptr = wrd_entry->memory_ptr; long new_size = more_memory_size(wrd_entry->memory_size, wrd_entry->number_of_occurances); /* cprintf(PRINT_AS_INDEXING, "Get more memory %ld bytes for %s\n", new_size, word); */ wrd_entry->memory_ptr = more_memory(wrd_entry->memory_ptr, wrd_entry->memory_size, new_size); wrd_entry->current_memory_ptr = wrd_entry->memory_ptr + /* new offset */ (wrd_entry->current_memory_ptr - old_memory_ptr); /* just being paranoid... no longer illegal if(wrd_entry->current_memory_ptr == wrd_entry->memory_ptr) panic("After allocating more memory, the size went to 0"); */ wrd_entry->memory_size = new_size; } /* finished making more memory */ /* add away */ wrd_entry->current_memory_ptr += write_bytes_to_memory(doc_id, DOCUMENT_ID_SIZE, wrd_entry->current_memory_ptr); wrd_entry->current_memory_ptr += write_bytes_to_memory(char_pos, CHARACTER_POSITION_SIZE, wrd_entry->current_memory_ptr); wrd_entry->current_memory_ptr += write_bytes_to_memory(weight + 5, /* add 5 since for the first one */ WEIGHT_SIZE, wrd_entry->current_memory_ptr); } else{ /* The word is already there, * just increment the weight in the record. * This will change when/if position information is kept (for proximity). */ if(wrd_entry->current_memory_ptr == wrd_entry->memory_ptr){ panic("Memory hashtable error. Recorded doc_id %ld, current doc_id %ld\n", wrd_entry->current_doc_id, doc_id); } *(wrd_entry->current_memory_ptr - 1) = add_weight(*(wrd_entry->current_memory_ptr - 1), weight); } } return(0L); } void add_stop_words(the_word_memory_hashtable) word_memory_hashtable *the_word_memory_hashtable; /* add the stop words to the hashtable. this must be done before adding other words */ { init_stop_list(); while(true){ char *word = next_stop_word(); word_entry* wrd_entry; if(NULL == word) break; wrd_entry = look_up_word(word, the_word_memory_hashtable); wrd_entry->number_of_occurances = STOP_WORD_FLAG; } } /* this clears the contents of the word_memory_hashtable */ void clear_word_memory_hashtable(the_word_memory_hashtable) word_memory_hashtable *the_word_memory_hashtable; { memset((char*)the_word_memory_hashtable->contents, 0, ((long)the_word_memory_hashtable->size * sizeof(size_t))); the_word_memory_hashtable->number_of_entries = 0; the_word_memory_hashtable->number_of_words_indexed = 0; } /* Size is in the number of entries. flush_after_n_words sets the hashtable flush parameter. Returns TRUE if it succeeds. */ word_memory_hashtable * init_word_memory_hashtable(size,flush_after_n_words,the_word_memory_hashtable) long size; long flush_after_n_words; word_memory_hashtable* the_word_memory_hashtable; { if(NULL != the_word_memory_hashtable){ /* then displose of the old one */ if(NULL != the_word_memory_hashtable->contents) s_free(the_word_memory_hashtable->contents); if(NULL != the_word_memory_hashtable->word_entry_block) s_free(the_word_memory_hashtable->word_entry_block); flush_word_occurance_buffers(); } the_word_memory_hashtable = (word_memory_hashtable*)s_malloc((size_t)sizeof(word_memory_hashtable)); the_word_memory_hashtable->size = size; the_word_memory_hashtable->word_entry_block_size = size / 2; the_word_memory_hashtable->contents = (word_entry **)s_malloc((size_t)(the_word_memory_hashtable->size * sizeof(size_t))); the_word_memory_hashtable->word_entry_block = (word_entry *)s_malloc((size_t)(the_word_memory_hashtable->word_entry_block_size * sizeof(word_entry))); if(NULL == the_word_memory_hashtable->contents){ panic("Could not malloc for the word hashtable\n"); return(NULL); } /* clear the hashtable the slow by safe way for(i = 0; i < the_word_memory_hashtable->size; i++){ the_word_memory_hashtable->contents[i] = (word_entry*)NULL; } */ clear_word_memory_hashtable(the_word_memory_hashtable); /* add the stopwords to the index */ add_stop_words(the_word_memory_hashtable); the_word_memory_hashtable->flush_after_n_words = flush_after_n_words; the_word_memory_hashtable->growth_factor = 2.0; the_word_memory_hashtable->grow_when_this_full = .5; return(the_word_memory_hashtable); }